package com.shyroke.spider;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.shyroke.entity.SpiderDataDto;

/**
 * Fetches a search-result page for a keyword and extracts the title, link and
 * snippet of every non-advertisement result into {@link SpiderDataDto} objects.
 */
public class Sprider {

	/** Search URL up to and including the {@code q=} query parameter. */
	private static final String URL_PRE = "http://so.pinlue.com/cse/search?q=";

	/** Fixed query-string tail appended after the search key. */
	private static final String URL_SUFF = "&click=1&s=16089285620727742084&nsid=";

	/** Marker at which a result snippet is cut off. */
	private static final String ELLIPSIS = "...";

	/**
	 * Runs a search for {@code key} and returns the parsed results.
	 *
	 * <p>Results without a {@code c-content} element (treated as advertisements)
	 * or without a {@code c-title} element are skipped. Each collected result is
	 * also printed to standard output.
	 *
	 * @param key the search keyword, concatenated into the request URL as-is
	 * @return the extracted results; empty (never {@code null}) when the page has
	 *         no {@code results} element or no response body was obtained
	 */
	public List<SpiderDataDto> getData(String key) {
		List<SpiderDataDto> dataList = new ArrayList<>();

		// NOTE(review): key is not URL-encoded here; confirm whether callers or
		// HttpUtil already encode it before adding encoding at this level.
		String result = HttpUtil.getInstance().httpGet(null, URL_PRE + key + URL_SUFF);
		if (result == null) {
			// No body to parse; presumably the request failed - verify against HttpUtil.
			return dataList;
		}

		Document rootDocument = Jsoup.parse(result);
		Element resultEle = rootDocument.getElementById("results");
		if (resultEle == null) {
			// The search returned no results.
			System.out.println("没有搜索到结果");
			return dataList;
		}

		for (Element ele : resultEle.getElementsByClass("result f s0")) {
			Elements contentBlocks = ele.getElementsByClass("c-content");
			if (contentBlocks.isEmpty()) {
				// Advertisements have no c-content block; they are not scraped.
				continue;
			}

			Elements titleBlocks = ele.getElementsByClass("c-title");
			if (titleBlocks.isEmpty()) {
				// Without a title block there is no title or link to extract.
				continue;
			}
			Elements links = titleBlocks.get(0).getElementsByTag("a");

			SpiderDataDto dataDto = new SpiderDataDto();
			dataDto.setContent(truncateAtEllipsis(contentBlocks.get(0).text()));
			dataDto.setTitle(links.text());
			dataDto.setUrl(links.attr("href"));
			dataList.add(dataDto);
		}

		for (SpiderDataDto data : dataList) {
			System.out.println(data);
		}

		return dataList;
	}

	/**
	 * Cuts {@code text} right after its first {@code "..."}.
	 *
	 * @param text the snippet text
	 * @return the text up to and including the first ellipsis, or {@code text}
	 *         unchanged when it contains no ellipsis
	 */
	private static String truncateAtEllipsis(String text) {
		int index = text.indexOf(ELLIPSIS);
		if (index < 0) {
			// indexOf returns -1 when absent; substring(0, -1) would throw.
			return text;
		}
		return text.substring(0, index) + ELLIPSIS;
	}

}
